R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

## Import all the libraries

importlib <- c("ggplot2", "stringr", "magrittr", "futile.logger", "VennDiagram",
               "tm", "SnowballC", "wordcloud", "RColorBrewer", "lattice",
               "caret", "rpart", "rpart.plot", "randomForest", "e1071",
               "ROCR", "gmodels", "mime", "plotly")

# require() is not vectorised over package names: calling require(importlib)
# would search for a single package literally named "importlib" (the knitted
# output confirmed this with "there is no package called 'importlib'").
# Load each package by name instead; character.only = TRUE makes require()
# treat each element as a package name, and lapply() returns one TRUE/FALSE
# per package so any load failure is visible in the result list.
lapply(importlib, require, character.only = TRUE)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.1
## Loading required package: stringr
## Warning: package 'stringr' was built under R version 3.4.1
## Loading required package: magrittr
## Warning: package 'magrittr' was built under R version 3.4.1
## Loading required package: futile.logger
## Warning: package 'futile.logger' was built under R version 3.4.1
## Loading required package: VennDiagram
## Warning: package 'VennDiagram' was built under R version 3.4.1
## Loading required package: grid
## Loading required package: tm
## Warning: package 'tm' was built under R version 3.4.1
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.4.1
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## Loading required package: SnowballC
## Warning: package 'SnowballC' was built under R version 3.4.1
## Loading required package: wordcloud
## Warning: package 'wordcloud' was built under R version 3.4.1
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.4.1
## Loading required package: lattice
## Loading required package: caret
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 3.4.1
## Loading required package: rpart.plot
## Warning: package 'rpart.plot' was built under R version 3.4.1
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 3.4.1
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## Loading required package: e1071
## Loading required package: ROCR
## Warning: package 'ROCR' was built under R version 3.4.2
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.1
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:wordcloud':
## 
##     textplot
## The following object is masked from 'package:stats':
## 
##     lowess
## Loading required package: gmodels
## Warning: package 'gmodels' was built under R version 3.4.1
## Loading required package: mime
## Warning: package 'mime' was built under R version 3.4.1
## Loading required package: plotly
## Warning: package 'plotly' was built under R version 3.4.1
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE
## 
## [[4]]
## [1] TRUE
## 
## [[5]]
## [1] TRUE
## 
## [[6]]
## [1] TRUE
## 
## [[7]]
## [1] TRUE
## 
## [[8]]
## [1] TRUE
## 
## [[9]]
## [1] TRUE
## 
## [[10]]
## [1] TRUE
## 
## [[11]]
## [1] TRUE
## 
## [[12]]
## [1] TRUE
## 
## [[13]]
## [1] TRUE
## 
## [[14]]
## [1] TRUE
## 
## [[15]]
## [1] TRUE
## 
## [[16]]
## [1] TRUE
## 
## [[17]]
## [1] TRUE
## 
## [[18]]
## [1] TRUE
## 
## [[19]]
## [1] TRUE

Load the dataset from disk.

# Load the dataset from disk. Spell out stringsAsFactors = FALSE rather than
# the abbreviation F (T/F are ordinary, reassignable variables in R) so the
# text columns are guaranteed to stay character vectors.
Spam_SMS <- read.csv("./SMS_Spam_Dataset.csv", stringsAsFactors = FALSE)

# Inspect the structure of the raw data frame.
str(Spam_SMS)
## 'data.frame':    5572 obs. of  5 variables:
##  $ v1 : chr  "ham" "ham" "spam" "ham" ...
##  $ v2 : chr  "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." ...
##  $ X  : chr  "" "" "" "" ...
##  $ X.1: chr  "" "" "" "" ...
##  $ X.2: chr  "" "" "" "" ...

Clean the data.

The first two columns of data are not labelled appropriately. Also, data has three null columns (X, X.1 and X.2). Therefore, I would remove the null columns and name the remaining columns appropriately.

# Drop the three empty columns (X, X.1, X.2) with a single assignment.
Spam_SMS[c("X", "X.1", "X.2")] <- NULL

# Give the two remaining columns meaningful names.
names(Spam_SMS) <- c("MessageLabel", "Message")

# Confirm that no NA values remain anywhere in the dataset.
colSums(is.na(Spam_SMS))
## MessageLabel      Message 
##            0            0
# Inspect the distinct label values present in the raw data.
levels(as.factor(Spam_SMS$MessageLabel))
## [1] "ham"  "spam"
# Recode the labels to descriptive names and convert to a factor in one
# step: "ham" -> "Legitimate", "spam" -> "Spam".
Spam_SMS$MessageLabel <- factor(Spam_SMS$MessageLabel,
                                levels = c("ham", "spam"),
                                labels = c("Legitimate", "Spam"))

Explore the data

Explore the distribution of Spam and Legitimate Messages. This distribution would help in validating the distribution of Spam SMS and Legitimate SMS in the training and test sets.

# Produce a data frame with the total number of legitimate and spam messages.
Distribution <- as.data.frame(table(Spam_SMS$MessageLabel))

# Add the percentage share of each label, rounded to two decimal places.
Distribution$Percentage <- round(Distribution$Freq / nrow(Spam_SMS) * 100,
                                 digits = 2)
names(Distribution) <- c("Label", "Total", "Percentage")

# Axis settings shared by both axes of the plotly layout.
List <- list(
     zeroline = FALSE,
     showline = FALSE,
     showticklabels = FALSE,
     showgrid = FALSE
 )

# Plot the distribution as a pie chart. Columns are referenced through the
# plotly formula interface (~Label) instead of attach()-ing the data frame:
# attach() leaves the columns on the search path for the rest of the session
# and is a well-known source of name-masking bugs.
plot_ly(Distribution, labels = ~Label, values = ~Percentage, type = "pie",
        hole = 0.2, showlegend = TRUE) %>%
  layout(title = "Distribution of Spam Messages v/s Legitimate Messages",
         xaxis = List, yaxis = List, showlegend = TRUE)

The pie chart reveals that about 86.6% of all the SMS messages in the dataset are legitimate messages, while about 13.4% of them are spam messages.

Compute the length of each text so that the data can be explored in more depth.

# Record the number of characters of every message in a new column.
Spam_SMS$MessageLength <- nchar(Spam_SMS$Message)

# Longest legitimate message.
with(Spam_SMS, max(MessageLength[MessageLabel == "Legitimate"]))
## [1] 910
# Longest spam message.
with(Spam_SMS, max(MessageLength[MessageLabel == "Spam"]))
## [1] 224
# Shortest legitimate message.
with(Spam_SMS, min(MessageLength[MessageLabel == "Legitimate"]))
## [1] 2
# Shortest spam message.
with(Spam_SMS, min(MessageLength[MessageLabel == "Spam"]))
## [1] 13

Plot the distribution of Legitimate and Spam messages v/s the Message Length.

# Histogram of message lengths (bin width = 5 characters), filled by label.
ggplot(Spam_SMS, aes(x = MessageLength, fill = MessageLabel)) +
  geom_histogram(binwidth = 5) +
  theme_bw() +
  labs(x = "Length of Message", y = "Number of Messages",
       title = "Distribution of Message Lengths with Class Labels")

This plot helps us understand the following: The most common length of Spam SMS is 160 characters. The most common length of Legitimate SMS is 20 characters. But, the length of legitimate SMS can be as short as just an “OK” in the text, that is only 2 characters, to being as long as having 900-1000 characters in the text. Moreover, legitimate SMS and spam SMS overlap each other for the entire range of Spam SMS. Therefore, analyzing Length of Message VS Number of Texts for each label did not provide much of information on features that differentite Spam from Ham.

Split Raw SMS Data on Labels (Spam and Legitimate) and produce wordclouds for each. Using a wordcloud would help understand frequent words: the more frequent the word, the larger its font will be. Producing wordclouds would give a better understanding of all the features that differentiate Spam SMSs from Legitimate SMSs.

# Split the raw SMS data on its labels (Spam and Legitimate).
Spam_Raw <- subset(Spam_SMS, MessageLabel == "Spam")
Legitimate_Raw <- subset(Spam_SMS, MessageLabel == "Legitimate")

# Colour palette shared by all wordclouds below.
pal <- brewer.pal(6, "Dark2")

# Word cloud of the 30 most frequent words in spam messages.
wordcloud(words = Spam_Raw$Message, max.words = 30, scale = c(6, .3),
          colors = pal)

The wordcloud reveals that the most frequent words in Spam messages are: Call, Free, Now, Mobile, Text and Prize.

# Word cloud of the 30 most frequent words in legitimate messages.
wordcloud(words = Legitimate_Raw$Message, max.words = 30, scale = c(4, .3),
          colors = pal)

The wordcloud reveals that the most frequent words in legitimate messages are: Can, Will, Now, Just, etc.

To convert all the words to lower case. Post that, run for loops for words manually selected as differentiating features for Spam SMSs, and for words revealed frequent by the above wordcloud produced for spam messages. This would be followed by correct assignment of ‘y’ or ‘n’ for each message in the dataset. (‘y’ corresponds to availability of that word in a particular SMS while ‘n’ corresponds to non-availability of that word in the SMS)

# Transform all messages to lower case (in place, via the magrittr
# compound-assignment pipe).
Spam_SMS$Message %<>% str_to_lower()

# Flag messages containing the token 'free'. str_detect() is vectorised over
# the whole Message column, so no row-by-row loop is needed; ifelse() maps
# the logical result to the same "y"/"n" coding the original loop produced.
Spam_SMS$free <- ifelse(str_detect(Spam_SMS$Message, "free"), "y", "n")

# Flag messages containing any prize/winner-style token: winner, win, won,
# award, selected, prize or claim. A single vectorised str_detect() with a
# regex alternation replaces seven per-row substring checks; each alternative
# is still a plain substring match, exactly as before. (Note "winner" also
# matches the "win" alternative, so the combined flag is unchanged.)
Spam_SMS$winner <- ifelse(
  str_detect(Spam_SMS$Message, "winner|win|won|award|selected|prize|claim"),
  "y", "n")

# Flag messages containing 'congrats' or 'congratulations', using a single
# vectorised str_detect() instead of a row-by-row loop.
Spam_SMS$congratulation <- ifelse(
  str_detect(Spam_SMS$Message, "congrats|congratulations"),
  "y", "n")

# Flag messages containing any adult-content token: xxx, babe, naked, dirty
# or flirty. One vectorised str_detect() with a regex alternation replaces
# five per-row substring checks.
Spam_SMS$adult <- ifelse(
  str_detect(Spam_SMS$Message, "xxx|babe|naked|dirty|flirty"),
  "y", "n")

# Flag messages containing any urgency token: urgent, attention, bonus,
# immediately, now or stop. One vectorised str_detect() with a regex
# alternation replaces six per-row substring checks.
Spam_SMS$attention <- ifelse(
  str_detect(Spam_SMS$Message, "urgent|attention|bonus|immediately|now|stop"),
  "y", "n")

# Flag messages containing any phone/texting token: ringtone, call, mobile,
# text or txt. One vectorised str_detect() with a regex alternation replaces
# five per-row substring checks.
Spam_SMS$ringtone <- ifelse(
  str_detect(Spam_SMS$Message, "ringtone|call|mobile|text|txt"),
  "y", "n")

After having this chunk run, there are 6 more columns added to the dataset (Spam_SMS) with values = y or n, depending on the availability of the keywords in messages.

Plot bar graph depicting total number of spam messages with the value of these features being equal to “y”.

# For each category only in Spam SMS

# Feature flag columns, named by the label they should carry in the plot.
Feature_Cols <- c(Free = "free", Adult = "adult", Ringtone = "ringtone",
                  Congratulation = "congratulation", Winner = "winner",
                  Attention = "attention")

# Count, for every feature, the number of SPAM messages flagged "y" for it.
# vapply() over the column names replaces six near-identical sum() calls and
# guarantees an integer result per feature.
Spam_Features <- data.frame(
  Features = names(Feature_Cols),
  Total = vapply(
    Feature_Cols,
    function(col) sum(Spam_SMS[[col]] == "y" & Spam_SMS$MessageLabel == "Spam"),
    integer(1),
    USE.NAMES = FALSE
  )
)

# Bar chart of the spam feature counts, ordered from most to least frequent,
# with the exact count printed above each bar.
ggplot(Spam_Features, aes(x = reorder(Features, -Total), y = Total)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = Total), color = "red", vjust = 0) +
  xlab("Features") +
  ylab("Total Number of Messages")

The plot reveals that the most frequently used keywords fall under the categories: Ringtone, Attention and Winner, while the least frequently used keywords fall under the categories: Congratulations, Adult and Free.

Plot bar graph depicting total number of messages with the value of these features being equal to “y”.

# For each category in All SMS

# Feature flag columns, named by the label they should carry in the plot.
Feature_Cols <- c(Free = "free", Adult = "adult", Ringtone = "ringtone",
                  Congratulation = "congratulation", Winner = "winner",
                  Attention = "attention")

# Count, for every feature, the number of messages of EITHER label flagged
# "y" for it. vapply() over the column names replaces six near-identical
# sum() calls and guarantees an integer result per feature.
Spam_Features_All <- data.frame(
  Features = names(Feature_Cols),
  Total = vapply(
    Feature_Cols,
    function(col) sum(Spam_SMS[[col]] == "y"),
    integer(1),
    USE.NAMES = FALSE
  )
)

# Bar chart of the feature counts across all messages, ordered from most to
# least frequent, with the exact count printed above each bar.
ggplot(Spam_Features_All, aes(x = reorder(Features, -Total), y = Total)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = Total), color = "red", vjust = 0) +
  xlab("Features") +
  ylab("Total Number of Messages")

The plot reveals that the most frequently used keywords fall under the categories: Ringtone, Attention and Winner, while the least frequently used keywords fall under the categories: Congratulations, Adult and Free. The comparison between the two bar plots reveals that the number of Spam SMS containing those words exceeds the number of Legitimate SMS containing those words by 54%.

Text Analysis

To make the data ready for building predictive models. In this, we use text-mining package (package tm) to manage the documents.

# Create a corpus of the messages in Spam_SMS.
BagOfWords <- Corpus(VectorSource(Spam_SMS$Message))

# Clean the corpus.
Clean_BagOfWords <- BagOfWords %>%
                    tm_map(content_transformer(tolower)) %>% # Transform to lower case
                    tm_map(removeNumbers) %>%                # Remove numbers
                    tm_map(removeWords, stopwords(kind="en")) %>% # Remove English stopwords
                    tm_map(removePunctuation) %>%            # Remove punctuation
                    tm_map(stripWhitespace)                  # Collapse runs of whitespace

# Transform the corpus into a document-term matrix (one row per message,
# one column per term). Assignment uses <-, consistent with the rest of
# the script (the original used `=`).
TDM <- DocumentTermMatrix(Clean_BagOfWords)

# Keep only terms appearing in at least 0.5% of documents (sparsity 0.995).
SparseWords <- removeSparseTerms(TDM, 0.995)

# Transform the matrix of sparse words into a data frame.
SparseWords <- as.data.frame(as.matrix(SparseWords))

# Make the term names syntactically valid R column names.
colnames(SparseWords) <- make.names(colnames(SparseWords))

str(SparseWords)
## 'data.frame':    5572 obs. of  290 variables:
##  $ got       : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ great     : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ wat       : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ world     : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ lar       : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ apply     : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ free      : num  0 0 1 0 0 0 0 0 0 2 ...
##  $ may       : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ receive   : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ text      : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ txt       : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ win       : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ already   : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ dun       : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ early     : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ say       : num  0 0 0 2 0 0 0 0 0 0 ...
##  $ around    : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ think     : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ back      : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ fun       : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ hey       : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ like      : num  0 0 0 0 0 1 2 0 0 0 ...
##  $ now       : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ send      : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ still     : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ word      : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ xxx       : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ even      : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ speak     : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ friends   : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ per       : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ call      : num  0 0 0 0 0 0 0 0 1 1 ...
##  $ claim     : num  0 0 0 0 0 0 0 0 2 0 ...
##  $ code      : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ customer  : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ network   : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ prize     : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ selected  : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ camera    : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ latest    : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ mobile    : num  0 0 0 0 0 0 0 0 0 2 ...
##  $ enough    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ gonna     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ home      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ soon      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ stuff     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ talk      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ today     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ tonight   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ want      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ cash      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ cost      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ days      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ reply     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ pobox     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ urgent    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ week      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ won       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ help      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ right     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ take      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ thank     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ will      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ wont      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ message   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ next.     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ use       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ watching  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ make      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ name      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ remember  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ yes       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ feel      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fine      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ way       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dont      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ miss      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ going     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ try       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ first     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ finish    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ lor       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ lunch     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ can       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ meet      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ eat       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ getting   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ just      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ lol       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ really    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ always    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ bus       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ dinner    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ left      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ love      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ amp       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ car       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ know      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ let       : num  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]
# Attach the class label to the term-count data frame so SparseWords can be
# used directly to train and evaluate classifiers.
SparseWords$MessageLabel <- Spam_SMS$MessageLabel

Classification Process to accurately classify SMS messages into Spam messages or Legitimate messages.

Splitting the data in a ratio of 7:3 — 70% to build the predictive model and 30% to test the model. I am splitting the dataset, SparseWords, the corpus (BagOfWords) and the Term Document Matrix.

# Fix the random seed so the partition below is reproducible.
set.seed(1234)

# Index vector selecting a stratified 70% of the rows for training
# (stratified on MessageLabel, so class proportions are preserved).
Split_Formula <- createDataPartition(Spam_SMS$MessageLabel, p = 0.7,
                                     list = FALSE)

# Train/test split of the raw dataset.
train_data <- Spam_SMS[Split_Formula, ]
test_data <- Spam_SMS[-Split_Formula, ]

# Train/test split of the sparse document-term data frame.
Sparse_train_data <- SparseWords[Split_Formula, ]
Sparse_test_data <- SparseWords[-Split_Formula, ]

# Train/test split of the cleaned corpus.
Corpus_train_data <- Clean_BagOfWords[Split_Formula]
Corpus_test_data <- Clean_BagOfWords[-Split_Formula]

# Train/test split of the document-term matrix.
TDM_train_data <- TDM[Split_Formula, ]
TDM_test_data <- TDM[-Split_Formula, ]

Validate distribution of Spam and Ham SMS in Training and Test sets.

# Class proportions in the training set — should match the full dataset
# (~86.6% Legitimate / ~13.4% Spam) because the partition was stratified.
prop.table(table(train_data$MessageLabel))
## 
## Legitimate       Spam 
##  0.8659318  0.1340682
# Class proportions in the test set, for the same check.
prop.table(table(test_data$MessageLabel))
## 
## Legitimate       Spam 
##  0.8659485  0.1340515

This validates that the class distribution has been preserved in both the training and test sets.

Producing Wordcloud of the cleaned Corpus for analysis.

# Word cloud of the 75 most frequent terms in the cleaned corpus, drawn in
# decreasing order of frequency rather than random order.
wordcloud(words = Clean_BagOfWords, max.words = 75, scale = c(5, .3),
          random.order = FALSE, colors = pal)

The wordcloud reveals that the most frequent words in the clean corpus (a mix of legitimate and spam messages) are: Call, Can, Now, Get, Just, Will, Free, etc. Therefore, it is evident that this wordcloud substantiates the two wordclouds produced above (one each for spam and legitimate messages), as this wordcloud contains a mix of the frequent words shown in those wordclouds (like: Free, Call, Can, Just).

Split train_data on Labels (Spam) and produce wordclouds for it. Using Wordcloud would help understand frequent words. More frequent the word, larger the font will be for it.

# Keep only the spam messages from the training data.
Spam <- subset(train_data, MessageLabel == "Spam")

# Word cloud of the 30 most frequent words in training-set spam messages.
wordcloud(words = Spam$Message, max.words = 30, scale = c(7, .3),
          colors = pal)

The wordcloud reveals that the most frequent words in spam messages of the training data are: Call, Free, Now, Claim, Text, etc. They are the same as the ones displayed in the wordcloud for spam messages in the full Spam_SMS dataset. Hence, this shows that the data has been correctly split into the training set.

Split test_data on Labels (Spam) and produce wordclouds for it. Using Wordcloud would help understand frequent words. More frequent the word, larger the font will be for it.

# Keep only the spam messages from the test data.
Spam_test <- subset(test_data, MessageLabel == "Spam")

# Word cloud of the 30 most frequent words in test-set spam messages.
wordcloud(words = Spam_test$Message, max.words = 30, scale = c(5, .3),
          colors = pal)

The wordcloud reveals that the most frequent words in spam messages of the test data are: Call, Free, Now, Claim, Text, etc. They are the same as the ones displayed in the wordcloud for spam messages in the full Spam_SMS dataset. Hence, this shows that the data has been correctly split into the test set.

Setting 1: Building models for all the features of Spam SMS.

Decision Tree Model

# Fit a recursive-partitioning classification tree predicting MessageLabel
# from every term column in the sparse training data.
SMS_Rpart_All <- rpart(MessageLabel ~ ., data = Sparse_train_data,
                       method = "class")

# Draw the fitted tree.
rpart.plot(SMS_Rpart_All, type = 4, fallen.leaves = FALSE, extra = 4)

This tree reveals that out of all these tokens, the most important token is ‘call’ and the least important ones being ‘mobile and stop’.

# Full summary of the fitted tree: complexity (CP) table, variable
# importance, and per-node split details.
summary(SMS_Rpart_All)
## Call:
## rpart(formula = MessageLabel ~ ., data = Sparse_train_data, method = "class")
##   n= 3901 
## 
##            CP nsplit rel error    xerror       xstd
## 1  0.15487572      0 1.0000000 1.0000000 0.04069031
## 2  0.15296367      1 0.8451243 0.9598470 0.03998875
## 3  0.06883365      2 0.6921606 0.6921606 0.03465013
## 4  0.01912046      3 0.6233270 0.6271511 0.03314086
## 5  0.01816444      4 0.6042065 0.6443595 0.03355012
## 6  0.01720841      6 0.5678776 0.6271511 0.03314086
## 7  0.01529637      8 0.5334608 0.6118547 0.03277083
## 8  0.01434034      9 0.5181644 0.5984704 0.03244208
## 9  0.01338432     11 0.4894837 0.5984704 0.03244208
## 10 0.01000000     12 0.4760994 0.5449331 0.03107756
## 
## Variable importance
##       call        txt      claim       text      later      reply 
##         26         19          8          7          4          4 
##      prize      sorry       stop     urgent        ppm        won 
##          3          2          2          2          2          2 
##       draw        can       free     mobile    awarded      nokia 
##          2          2          1          1          1          1 
##       tone        yes      pobox guaranteed       send 
##          1          1          1          1          1 
## 
## Node number 1: 3901 observations,    complexity param=0.1548757
##   predicted class=Legitimate  expected loss=0.1340682  P(node) =1
##     class counts:  3378   523
##    probabilities: 0.866 0.134 
##   left son=2 (3524 obs) right son=3 (377 obs)
##   Primary splits:
##       call   < 0.5 to the left,  improve=187.02190, (0 missing)
##       txt    < 0.5 to the left,  improve=129.74050, (0 missing)
##       claim  < 0.5 to the left,  improve=127.17900, (0 missing)
##       free   < 0.5 to the left,  improve=114.55070, (0 missing)
##       mobile < 0.5 to the left,  improve= 95.84149, (0 missing)
##   Surrogate splits:
##       prize  < 0.5 to the left,  agree=0.914, adj=0.106, (0 split)
##       claim  < 0.5 to the left,  agree=0.912, adj=0.088, (0 split)
##       urgent < 0.5 to the left,  agree=0.912, adj=0.085, (0 split)
##       won    < 0.5 to the left,  agree=0.911, adj=0.082, (0 split)
##       ppm    < 0.5 to the left,  agree=0.911, adj=0.082, (0 split)
## 
## Node number 2: 3524 observations,    complexity param=0.1529637
##   predicted class=Legitimate  expected loss=0.08342792  P(node) =0.9033581
##     class counts:  3230   294
##    probabilities: 0.917 0.083 
##   left son=4 (3426 obs) right son=5 (98 obs)
##   Primary splits:
##       txt  < 0.5 to the left,  improve=137.13040, (0 missing)
##       free < 0.5 to the left,  improve= 80.41720, (0 missing)
##       stop < 0.5 to the left,  improve= 59.43363, (0 missing)
##       win  < 0.5 to the left,  improve= 55.76072, (0 missing)
##       text < 0.5 to the left,  improve= 51.42719, (0 missing)
##   Surrogate splits:
##       draw    < 0.5 to the left,  agree=0.974, adj=0.082, (0 split)
##       nokia   < 1.5 to the left,  agree=0.973, adj=0.041, (0 split)
##       awarded < 0.5 to the left,  agree=0.973, adj=0.041, (0 split)
##       tone    < 0.5 to the left,  agree=0.973, adj=0.041, (0 split)
##       tcs     < 0.5 to the left,  agree=0.973, adj=0.020, (0 split)
## 
## Node number 3: 377 observations,    complexity param=0.06883365
##   predicted class=Spam        expected loss=0.3925729  P(node) =0.09664189
##     class counts:   148   229
##    probabilities: 0.393 0.607 
##   left son=6 (36 obs) right son=7 (341 obs)
##   Primary splits:
##       later  < 0.5 to the right, improve=29.37026, (0 missing)
##       sorry  < 0.5 to the right, improve=24.09157, (0 missing)
##       claim  < 0.5 to the left,  improve=21.12756, (0 missing)
##       prize  < 0.5 to the left,  improve=17.35938, (0 missing)
##       urgent < 0.5 to the left,  improve=14.56856, (0 missing)
##   Surrogate splits:
##       sorry   < 0.5 to the right, agree=0.960, adj=0.583, (0 split)
##       meeting < 0.5 to the right, agree=0.915, adj=0.111, (0 split)
## 
## Node number 4: 3426 observations,    complexity param=0.01912046
##   predicted class=Legitimate  expected loss=0.05983654  P(node) =0.8782363
##     class counts:  3221   205
##    probabilities: 0.940 0.060 
##   left son=8 (3334 obs) right son=9 (92 obs)
##   Primary splits:
##       text  < 0.5 to the left,  improve=46.23725, (0 missing)
##       free  < 0.5 to the left,  improve=41.09870, (0 missing)
##       reply < 0.5 to the left,  improve=37.11689, (0 missing)
##       stop  < 0.5 to the left,  improve=35.03827, (0 missing)
##       claim < 0.5 to the left,  improve=31.98873, (0 missing)
##   Surrogate splits:
##       free    < 2.5 to the left,  agree=0.974, adj=0.043, (0 split)
##       pobox   < 0.5 to the left,  agree=0.974, adj=0.033, (0 split)
##       message < 1.5 to the left,  agree=0.974, adj=0.022, (0 split)
##       video   < 0.5 to the left,  agree=0.973, adj=0.011, (0 split)
## 
## Node number 5: 98 observations
##   predicted class=Spam        expected loss=0.09183673  P(node) =0.02512176
##     class counts:     9    89
##    probabilities: 0.092 0.908 
## 
## Node number 6: 36 observations
##   predicted class=Legitimate  expected loss=0  P(node) =0.009228403
##     class counts:    36     0
##    probabilities: 1.000 0.000 
## 
## Node number 7: 341 observations,    complexity param=0.01720841
##   predicted class=Spam        expected loss=0.3284457  P(node) =0.08741348
##     class counts:   112   229
##    probabilities: 0.328 0.672 
##   left son=14 (283 obs) right son=15 (58 obs)
##   Primary splits:
##       claim  < 0.5 to the left,  improve=15.07833, (0 missing)
##       can    < 0.5 to the right, improve=13.87236, (0 missing)
##       prize  < 0.5 to the left,  improve=12.34596, (0 missing)
##       urgent < 0.5 to the left,  improve=10.33451, (0 missing)
##       won    < 0.5 to the left,  improve= 9.50100, (0 missing)
##   Surrogate splits:
##       guaranteed < 0.5 to the left,  agree=0.871, adj=0.241, (0 split)
##       prize      < 0.5 to the left,  agree=0.862, adj=0.190, (0 split)
##       draw       < 0.5 to the left,  agree=0.859, adj=0.172, (0 split)
##       hrs        < 0.5 to the left,  agree=0.859, adj=0.172, (0 split)
##       selected   < 0.5 to the left,  agree=0.856, adj=0.155, (0 split)
## 
## Node number 8: 3334 observations,    complexity param=0.01816444
##   predicted class=Legitimate  expected loss=0.04619076  P(node) =0.8546527
##     class counts:  3180   154
##    probabilities: 0.954 0.046 
##   left son=16 (3284 obs) right son=17 (50 obs)
##   Primary splits:
##       reply < 0.5 to the left,  improve=28.92908, (0 missing)
##       claim < 0.5 to the left,  improve=20.08080, (0 missing)
##       stop  < 0.5 to the left,  improve=19.10557, (0 missing)
##       free  < 0.5 to the left,  improve=18.22569, (0 missing)
##       win   < 0.5 to the left,  improve=16.54198, (0 missing)
##   Surrogate splits:
##       stop < 1.5 to the left,  agree=0.987, adj=0.12, (0 split)
##       end  < 1.5 to the left,  agree=0.986, adj=0.04, (0 split)
##       went < 2.5 to the left,  agree=0.986, adj=0.04, (0 split)
## 
## Node number 9: 92 observations,    complexity param=0.01434034
##   predicted class=Spam        expected loss=0.4456522  P(node) =0.0235837
##     class counts:    41    51
##    probabilities: 0.446 0.554 
##   left son=18 (70 obs) right son=19 (22 obs)
##   Primary splits:
##       free   < 0.5 to the left,  improve=5.531846, (0 missing)
##       mobile < 0.5 to the left,  improve=5.481522, (0 missing)
##       stop   < 0.5 to the left,  improve=4.625020, (0 missing)
##       text   < 1.5 to the left,  improve=4.456522, (0 missing)
##       yes    < 0.5 to the left,  improve=3.480331, (0 missing)
##   Surrogate splits:
##       fun     < 0.5 to the left,  agree=0.793, adj=0.136, (0 split)
##       word    < 0.5 to the left,  agree=0.793, adj=0.136, (0 split)
##       latest  < 0.5 to the left,  agree=0.793, adj=0.136, (0 split)
##       orange  < 0.5 to the left,  agree=0.793, adj=0.136, (0 split)
##       message < 1.5 to the left,  agree=0.783, adj=0.091, (0 split)
## 
## Node number 14: 283 observations,    complexity param=0.01720841
##   predicted class=Spam        expected loss=0.3957597  P(node) =0.0725455
##     class counts:   112   171
##    probabilities: 0.396 0.604 
##   left son=28 (26 obs) right son=29 (257 obs)
##   Primary splits:
##       can    < 0.5 to the right, improve=11.615610, (0 missing)
##       mobile < 0.5 to the left,  improve=10.372500, (0 missing)
##       urgent < 0.5 to the left,  improve= 8.968500, (0 missing)
##       ppm    < 0.5 to the left,  improve= 7.842131, (0 missing)
##       mins   < 0.5 to the left,  improve= 7.472429, (0 missing)
##   Surrogate splits:
##       dont < 0.5 to the right, agree=0.919, adj=0.115, (0 split)
##       come < 1.5 to the right, agree=0.919, adj=0.115, (0 split)
##       back < 0.5 to the right, agree=0.915, adj=0.077, (0 split)
##       help < 1.5 to the right, agree=0.915, adj=0.077, (0 split)
##       sure < 0.5 to the right, agree=0.915, adj=0.077, (0 split)
## 
## Node number 15: 58 observations
##   predicted class=Spam        expected loss=0  P(node) =0.01486798
##     class counts:     0    58
##    probabilities: 0.000 1.000 
## 
## Node number 16: 3284 observations,    complexity param=0.01816444
##   predicted class=Legitimate  expected loss=0.03806334  P(node) =0.8418354
##     class counts:  3159   125
##    probabilities: 0.962 0.038 
##   left son=32 (3273 obs) right son=33 (11 obs)
##   Primary splits:
##       claim  < 0.5 to the left,  improve=20.425500, (0 missing)
##       free   < 0.5 to the left,  improve=12.278720, (0 missing)
##       cash   < 0.5 to the left,  improve=11.181800, (0 missing)
##       send   < 0.5 to the left,  improve=10.543650, (0 missing)
##       mobile < 0.5 to the left,  improve= 9.452283, (0 missing)
##   Surrogate splits:
##       apply   < 0.5 to the left,  agree=0.997, adj=0.091, (0 split)
##       receive < 0.5 to the left,  agree=0.997, adj=0.091, (0 split)
## 
## Node number 17: 50 observations,    complexity param=0.01529637
##   predicted class=Spam        expected loss=0.42  P(node) =0.01281723
##     class counts:    21    29
##    probabilities: 0.420 0.580 
##   left son=34 (34 obs) right son=35 (16 obs)
##   Primary splits:
##       stop < 0.5 to the left,  improve=8.3011760, (0 missing)
##       send < 0.5 to the left,  improve=2.0944170, (0 missing)
##       yes  < 0.5 to the left,  improve=1.2503650, (0 missing)
##       free < 0.5 to the left,  improve=0.5504762, (0 missing)
##       now  < 0.5 to the left,  improve=0.2935548, (0 missing)
##   Surrogate splits:
##       send   < 0.5 to the left,  agree=0.82, adj=0.438, (0 split)
##       see    < 0.5 to the left,  agree=0.78, adj=0.312, (0 split)
##       friend < 0.5 to the left,  agree=0.78, adj=0.312, (0 split)
##       yes    < 0.5 to the left,  agree=0.74, adj=0.188, (0 split)
##       per    < 0.5 to the left,  agree=0.72, adj=0.125, (0 split)
## 
## Node number 18: 70 observations,    complexity param=0.01434034
##   predicted class=Legitimate  expected loss=0.4571429  P(node) =0.01794412
##     class counts:    38    32
##    probabilities: 0.543 0.457 
##   left son=36 (59 obs) right son=37 (11 obs)
##   Primary splits:
##       stop   < 0.5 to the left,  improve=5.331455, (0 missing)
##       mobile < 0.5 to the left,  improve=4.584127, (0 missing)
##       now    < 0.5 to the left,  improve=2.742857, (0 missing)
##       reply  < 0.5 to the left,  improve=2.488889, (0 missing)
##       new    < 0.5 to the left,  improve=2.488889, (0 missing)
##   Surrogate splits:
##       help < 0.5 to the left,  agree=0.886, adj=0.273, (0 split)
##       live < 0.5 to the left,  agree=0.886, adj=0.273, (0 split)
##       pls  < 0.5 to the left,  agree=0.886, adj=0.273, (0 split)
##       sms  < 0.5 to the left,  agree=0.886, adj=0.273, (0 split)
##       per  < 0.5 to the left,  agree=0.871, adj=0.182, (0 split)
## 
## Node number 19: 22 observations
##   predicted class=Spam        expected loss=0.1363636  P(node) =0.00563958
##     class counts:     3    19
##    probabilities: 0.136 0.864 
## 
## Node number 28: 26 observations
##   predicted class=Legitimate  expected loss=0.1538462  P(node) =0.006664958
##     class counts:    22     4
##    probabilities: 0.846 0.154 
## 
## Node number 29: 257 observations
##   predicted class=Spam        expected loss=0.3501946  P(node) =0.06588054
##     class counts:    90   167
##    probabilities: 0.350 0.650 
## 
## Node number 32: 3273 observations
##   predicted class=Legitimate  expected loss=0.03483043  P(node) =0.8390156
##     class counts:  3159   114
##    probabilities: 0.965 0.035 
## 
## Node number 33: 11 observations
##   predicted class=Spam        expected loss=0  P(node) =0.00281979
##     class counts:     0    11
##    probabilities: 0.000 1.000 
## 
## Node number 34: 34 observations
##   predicted class=Legitimate  expected loss=0.3823529  P(node) =0.008715714
##     class counts:    21    13
##    probabilities: 0.618 0.382 
## 
## Node number 35: 16 observations
##   predicted class=Spam        expected loss=0  P(node) =0.004101512
##     class counts:     0    16
##    probabilities: 0.000 1.000 
## 
## Node number 36: 59 observations,    complexity param=0.01338432
##   predicted class=Legitimate  expected loss=0.3728814  P(node) =0.01512433
##     class counts:    37    22
##    probabilities: 0.627 0.373 
##   left son=72 (52 obs) right son=73 (7 obs)
##   Primary splits:
##       mobile < 0.5 to the left,  improve=6.24706600, (0 missing)
##       now    < 0.5 to the left,  improve=3.48210900, (0 missing)
##       can    < 0.5 to the right, improve=0.12069290, (0 missing)
##       get    < 0.5 to the right, improve=0.12069290, (0 missing)
##       just   < 0.5 to the right, improve=0.03322034, (0 missing)
##   Surrogate splits:
##       claim  < 0.5 to the left,  agree=0.949, adj=0.571, (0 split)
##       yes    < 0.5 to the left,  agree=0.949, adj=0.571, (0 split)
##       today  < 0.5 to the left,  agree=0.932, adj=0.429, (0 split)
##       pobox  < 0.5 to the left,  agree=0.932, adj=0.429, (0 split)
##       chance < 0.5 to the left,  agree=0.932, adj=0.429, (0 split)
## 
## Node number 37: 11 observations
##   predicted class=Spam        expected loss=0.09090909  P(node) =0.00281979
##     class counts:     1    10
##    probabilities: 0.091 0.909 
## 
## Node number 72: 52 observations
##   predicted class=Legitimate  expected loss=0.2884615  P(node) =0.01332992
##     class counts:    37    15
##    probabilities: 0.712 0.288 
## 
## Node number 73: 7 observations
##   predicted class=Spam        expected loss=0  P(node) =0.001794412
##     class counts:     0     7
##    probabilities: 0.000 1.000

Random Forest Classifier

Apply Random Forest to substantiate analysis of Decision Tree by plotting the importance of each token.

# randomForest() requires a factor response for classification.
Sparse_train_data$MessageLabel %<>% as.factor()

# Fit a random forest on all token counts.
# ntree = 25 keeps training fast; proximity = TRUE stores the proximity
# matrix (only needed for later MDS plots / outlier inspection).
RFSpam_Tree_All <- randomForest(MessageLabel~., data = Sparse_train_data, ntree=25, proximity = TRUE)

# Variable-importance plot for the top 10 tokens.
# BUG FIX: in the original call the closing parenthesis of min() was
# misplaced, so `main` was passed into min() instead of varImpPlot()
# and the plot title was silently dropped.
ImportancePlot <- varImpPlot(RFSpam_Tree_All,
                             n.var = min(10, nrow(RFSpam_Tree_All$importance)),
                             main = "Importance of each Token")

This plot also shows that the most important token overall is ‘call’.

# Importance of each token in a tabular form.
# MeanDecreaseGini: total reduction in node impurity contributed by splits
# on that token, averaged over all trees (larger = more important).
importance(RFSpam_Tree_All)
##            MeanDecreaseGini
## got             1.129843044
## great           1.992217486
## wat             0.446702151
## world           0.360151486
## lar             0.014962133
## apply           8.359609584
## free           44.730644545
## may             0.108664588
## receive         2.535971787
## text           23.121395528
## txt            66.541480150
## win            19.546809878
## already         0.154236583
## dun             0.009148150
## early           0.017496039
## say             0.058293121
## around          0.523081565
## think           0.647525269
## back            0.761893343
## fun             1.270593257
## hey             0.756755885
## like            1.222363187
## now            16.996287449
## send            6.216653240
## still           1.163799572
## word            0.800023673
## xxx             1.403111943
## even            0.722252845
## speak           0.805484986
## friends         0.128628718
## per             4.785028825
## call           54.728977599
## claim          41.325071352
## code            8.449954301
## customer        8.190163390
## network         4.116453844
## prize          28.612263782
## selected        6.228920820
## camera          4.474178231
## latest          1.690966455
## mobile         38.885107871
## enough          0.083730931
## gonna           0.267855047
## home            0.345258359
## soon            0.029957707
## stuff           0.490283381
## talk            0.996798174
## today           1.150020783
## tonight         0.030558099
## want            1.381315316
## cash            9.168959487
## cost            2.670183641
## days            1.056115024
## reply          17.604895965
## pobox           8.009708999
## urgent         13.807278517
## week            1.585380102
## won            14.366337737
## help            2.436964830
## right           0.430466976
## take            1.106420290
## thank           0.055768370
## will            3.380528339
## wont            0.014979991
## message         6.863442352
## next.           1.802818730
## use             0.821219433
## watching        0.007561013
## make            1.181532330
## name            0.440841135
## remember        0.498799213
## yes             1.840856067
## feel            0.201802542
## fine            0.045139003
## way             0.524624143
## dont            1.035113359
## miss            0.655466222
## going           0.801975707
## try             0.341694180
## first           0.222984252
## finish          0.827045617
## lor             0.072268714
## lunch           0.003704900
## can             4.901656096
## meet            0.472639242
## eat             0.126890148
## getting         0.939464802
## just            2.333222639
## lol             0.081704299
## really          0.055171989
## always          0.155837143
## bus             0.787545471
## dinner          0.211590531
## left            0.195342337
## love            1.188581835
## amp             0.913862630
## car             0.245107833
## know            1.671905059
## let             0.628356311
## room            0.002543080
## work            0.808857462
## live            0.663499532
## sure            0.493906738
## wait            0.363817198
## yeah            0.103712012
## anything        0.038175482
## tell            0.612631922
## month           0.093845929
## please          5.788080743
## thanks          0.843489451
## look            0.407441131
## msg             3.080935416
## yup             0.321275255
## done            0.126340078
## see             1.555774951
## hello           0.916241082
## trying          0.278597352
## pls             1.279677568
## weekend         0.771884327
## need            0.733358552
## sweet           0.019127472
## nokia           9.682815389
## sms             2.650057257
## tomorrow        1.126367179
## hope            0.418269017
## ltgt            1.629924275
## man             0.208155622
## well            0.323388881
## get             2.103069376
## ask             0.317410142
## bit             0.526666838
## maybe           0.007424518
## class           0.106551540
## time            1.761583468
## half            0.263761133
## morning         0.141137769
## place           1.509138678
## best            0.402439942
## give            0.866637797
## happy           0.289670831
## never           0.251862146
## sorry           2.001495806
## thought         0.455026517
## end             1.269654227
## new             8.358669748
## play            1.515139840
## find            2.245646821
## special         0.850884003
## year            0.638768519
## later           1.879784201
## meeting         0.358769726
## pick            0.665268098
## good            1.103903915
## part            0.345959377
## come            0.724686576
## check           0.407046525
## nice            0.043792750
## said            0.062473335
## awarded         8.137287862
## day             2.832583506
## hear            0.309140900
## money           0.428595694
## babe            1.661461740
## something       0.084793786
## wanna           0.983489361
## waiting         1.701743173
## cool            0.244227053
## thats           0.041219600
## much            0.038509118
## job             0.005045842
## looking         2.851463590
## stop           27.075444262
## one             1.546057134
## real            0.075630359
## bed             0.339233903
## another         0.190807302
## late            0.081519524
## night           0.264381227
## smile           0.061380327
## someone         1.504130599
## guaranteed      4.136545602
## service        12.670163837
## buy             1.050730687
## forgot          0.007465278
## nothing         0.053713253
## long            0.052125351
## yet             0.078667673
## guess           0.580505343
## dear            0.917258991
## life            0.268574936
## lot             0.321165579
## birthday        0.043060226
## aight           0.477275744
## better          0.202371082
## people          0.219287955
## cos             0.330708839
## things          0.662219924
## contact        12.613661584
## draw            7.096781313
## hrs             0.057939192
## last            0.050303654
## ppm             2.979884871
## shows           3.441632408
## went            0.003164258
## holiday         5.543301563
## account         3.648078213
## landline        2.977024627
## todays          0.357584422
## sent            0.517545748
## girl            0.335945712
## chat           13.676474990
## sir             0.312955836
## gud             0.004010999
## little          0.616334816
## luv             0.559783367
## thk             0.354139280
## house           0.037978085
## keep            0.243065529
## friend          0.933865487
## also            0.514071434
## liao            0.010277454
## coming          0.279253148
## cant            1.192202787
## ill             0.373469675
## offer           1.711636573
## guys            0.762663982
## working         0.363784254
## haha            0.013507620
## jus             0.082611140
## every           0.667378238
## dat             0.001546056
## big             0.003833740
## ready           0.844099292
## leh             0.061304560
## easy            0.769497270
## called          0.508747293
## nite            0.360399618
## start           0.716092724
## reach           0.413713602
## person          0.145665760
## everything      0.423020052
## thanx           0.009833901
## told            0.315597926
## watch           0.036409203
## asked           0.288265407
## didnt           0.181325605
## sleep           0.142827075
## min             1.148422706
## care            0.362290098
## mins            3.806485242
## video           6.388780182
## shopping        0.085055943
## plan            0.264897772
## box             9.456449821
## might           0.189505728
## baby            0.038573199
## hour            0.021203793
## phone           2.991320645
## shit            0.027758364
## dunno           0.004622116
## problem         0.491123269
## line            2.175219848
## number          1.245488623
## chance          3.111521111
## two             0.014181946
## ever            0.235790852
## minutes         0.013840917
## orange          7.299932461
## wish            0.168925271
## quite           0.238558509
## leave           0.274766682
## sat             0.133879060
## actually        0.007283192
## put             0.203333701
## god             0.401358951
## tone           12.720157793
## thing           0.166550643
## den             0.001164947
## heart           0.086852446
## mind            0.101916729
## bad             0.693325652
## tcs             2.513928970
## enjoy           1.641871324
## princess        0.007306330
## many            0.549243113
## shall           0.055420364
## kiss            0.007225108
## probably        0.249540807
## dad             0.001053391
## wan             0.432108572

Test the above Random Forest Model on test data and check the accuracy, precision, recall and F1.

# confusionMatrix() needs both vectors as factors with matching levels.
Sparse_test_data$MessageLabel %<>% as.factor()

# Predict once on the held-out test set and reuse the result below.
RFTest_All <- predict(RFSpam_Tree_All, newdata =Sparse_test_data)

# Confusion matrix with accuracy / sensitivity / specificity statistics.
# FIX: reuse RFTest_All instead of calling predict() a second time
# (the original recomputed the identical prediction inside confusionMatrix).
RFTest_Matrix_All <- confusionMatrix(RFTest_All, Sparse_test_data$MessageLabel)
RFTest_Matrix_All
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Legitimate Spam
##   Legitimate       1440   49
##   Spam                7  175
##                                           
##                Accuracy : 0.9665          
##                  95% CI : (0.9567, 0.9746)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8432          
##  Mcnemar's Test P-Value : 4.281e-08       
##                                           
##             Sensitivity : 0.9952          
##             Specificity : 0.7812          
##          Pos Pred Value : 0.9671          
##          Neg Pred Value : 0.9615          
##              Prevalence : 0.8659          
##          Detection Rate : 0.8618          
##    Detection Prevalence : 0.8911          
##       Balanced Accuracy : 0.8882          
##                                           
##        'Positive' Class : Legitimate      
## 
# Cross-tabulation of predictions vs. true labels with row, column and
# table proportions (chi-square contributions suppressed).
CrossTable(RFTest_All, Sparse_test_data$MessageLabel, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1671 
## 
##  
##              | Sparse_test_data$MessageLabel 
##   RFTest_All | Legitimate |       Spam |  Row Total | 
## -------------|------------|------------|------------|
##   Legitimate |       1440 |         49 |       1489 | 
##              |      0.967 |      0.033 |      0.891 | 
##              |      0.995 |      0.219 |            | 
##              |      0.862 |      0.029 |            | 
## -------------|------------|------------|------------|
##         Spam |          7 |        175 |        182 | 
##              |      0.038 |      0.962 |      0.109 | 
##              |      0.005 |      0.781 |            | 
##              |      0.004 |      0.105 |            | 
## -------------|------------|------------|------------|
## Column Total |       1447 |        224 |       1671 | 
##              |      0.866 |      0.134 |            | 
## -------------|------------|------------|------------|
## 
## 

This classifier has produced a model with: 1. Precision for predicting Legitimate messages of 0.97, and 0.96 for predicting spam messages. 2. Recall for predicting Legitimate messages of 0.99, and 0.78 for predicting spam messages. Therefore, we can deduce that the probability of predicting a Legitimate message as Spam is quite low (0.005) compared to the probability of a spam message being predicted as a legitimate message (0.22).

Accuracy for test Data.

# Overall test-set accuracy: percentage of predictions matching the true
# label. mean() of a logical vector is the proportion of TRUEs —
# equivalent to sum(...)/length(...) but clearer.
TestPredictability_All <- mean(RFTest_All == Sparse_test_data$MessageLabel) * 100

# FIX: corrected the "Predcitability" typo in the status message.
message("Predictability Percentage for Test Data is:")
## Predictability Percentage for Test Data is:
print(TestPredictability_All)
## [1] 96.64871

Plot Confusion Matrix

# Long-format data frame of the RF confusion-matrix cells: one row per
# (reference, prediction) pair, counts taken from RFTest_Matrix_All above.
Reference_RF_All <- factor(c("Legitimate", "Legitimate", "Spam", "Spam"))
Prediction_RF_All <- factor(c("Legitimate", "Spam", "Legitimate", "Spam"))
Y_All <- c(1440, 7, 49, 175)
ConfusionMatrixPlot_All <- data.frame(Reference_RF_All, Prediction_RF_All, Y_All)

# Heat-map confusion matrix: the tile fill encodes the cell count and the
# count itself is printed inside each tile.
ggplot(ConfusionMatrixPlot_All,
       aes(x = Reference_RF_All, y = Prediction_RF_All)) +
  geom_tile(aes(fill = Y_All), colour = "white") +
  geom_text(aes(label = sprintf("%1.0f", Y_All)), vjust = 1) +
  scale_fill_gradient(low = "yellow", high = "dark green") +
  theme_bw() +
  theme(legend.position = "none")

Support Vector Machine

# Linear-kernel SVM; cost = 0.1 allows a wide soft margin.
# NOTE(review): gamma is ignored by the linear kernel — kept to preserve
# the original call.
SMS_SVM_All <- svm(MessageLabel ~., data = Sparse_train_data, kernel = "linear", cost = 0.1, gamma = 0.1)
SVMTest_All <- predict(SMS_SVM_All, Sparse_test_data)

# Confusion matrix with summary statistics.
# FIX: reuse SVMTest_All instead of recomputing predict() a second time.
SVM_Measure_All <- confusionMatrix(SVMTest_All, Sparse_test_data$MessageLabel)
SVM_Measure_All
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Legitimate Spam
##   Legitimate       1412   28
##   Spam               35  196
##                                          
##                Accuracy : 0.9623         
##                  95% CI : (0.952, 0.9709)
##     No Information Rate : 0.8659         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.8397         
##  Mcnemar's Test P-Value : 0.4497         
##                                          
##             Sensitivity : 0.9758         
##             Specificity : 0.8750         
##          Pos Pred Value : 0.9806         
##          Neg Pred Value : 0.8485         
##              Prevalence : 0.8659         
##          Detection Rate : 0.8450         
##    Detection Prevalence : 0.8618         
##       Balanced Accuracy : 0.9254         
##                                          
##        'Positive' Class : Legitimate     
## 
# Cross-tabulation of SVM predictions vs. true labels with row, column
# and table proportions (chi-square contributions suppressed).
CrossTable(SVMTest_All, Sparse_test_data$MessageLabel, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1671 
## 
##  
##              | Sparse_test_data$MessageLabel 
##  SVMTest_All | Legitimate |       Spam |  Row Total | 
## -------------|------------|------------|------------|
##   Legitimate |       1412 |         28 |       1440 | 
##              |      0.981 |      0.019 |      0.862 | 
##              |      0.976 |      0.125 |            | 
##              |      0.845 |      0.017 |            | 
## -------------|------------|------------|------------|
##         Spam |         35 |        196 |        231 | 
##              |      0.152 |      0.848 |      0.138 | 
##              |      0.024 |      0.875 |            | 
##              |      0.021 |      0.117 |            | 
## -------------|------------|------------|------------|
## Column Total |       1447 |        224 |       1671 | 
##              |      0.866 |      0.134 |            | 
## -------------|------------|------------|------------|
## 
## 

This classifier has produced a model with: 1. Precision for predicting Legitimate messages of 0.98, and 0.85 for predicting spam messages. 2. Recall for predicting Legitimate messages of 0.98, and 0.88 for predicting spam messages. Therefore, the probability of predicting a Legitimate message as Spam (0.024) is lower than the probability of a spam message being predicted as a legitimate message (0.125), though the SVM misclassifies more legitimate messages than the random forest did.

Accuracy for test data.

# Contingency table of true label vs. SVM prediction (kept for inspection).
svm.accuracy.table_All <- as.data.frame(table(Sparse_test_data$MessageLabel, SVMTest_All))
# Accuracy = proportion of predictions equal to the true label.
# FIX: the original summed Freq[1] and Freq[4] by position, which silently
# breaks if a factor level is missing or reordered; comparing the vectors
# directly is order-safe and yields the same value here.
print(paste("Accuracy for SVM is:",
            100*round(mean(SVMTest_All == Sparse_test_data$MessageLabel), 4),
            "%"))
## [1] "Accuracy for SVM is: 96.23 %"

Plot Confusion Matrix

# Long-format data frame of the SVM confusion-matrix cells: one row per
# (reference, prediction) pair, counts taken from SVM_Measure_All above.
Reference_SVM_All <- factor(c("Legitimate", "Legitimate", "Spam", "Spam"))
Prediction_SVM_All <- factor(c("Legitimate", "Spam", "Legitimate", "Spam"))
Y_SVM_All <- c(1412, 35, 28, 196)
ConfusionMatrixPlot_SVM_All <- data.frame(Reference_SVM_All, Prediction_SVM_All, Y_SVM_All)

# Heat-map confusion matrix: the tile fill encodes the cell count and the
# count itself is printed inside each tile.
ggplot(ConfusionMatrixPlot_SVM_All,
       aes(x = Reference_SVM_All, y = Prediction_SVM_All)) +
  geom_tile(aes(fill = Y_SVM_All), colour = "white") +
  geom_text(aes(label = sprintf("%1.0f", Y_SVM_All)), vjust = 1) +
  scale_fill_gradient(low = "yellow", high = "dark green") +
  theme_bw() +
  theme(legend.position = "none")

Logistic Regression

# Logistic regression over all token counts.
# NOTE(review): the warning below ("fitted probabilities numerically 0 or
# 1") indicates quasi-complete separation — some tokens perfectly predict
# a class in the training data, so individual coefficient estimates and
# their standard errors are unstable (visible in the summary output).
SMS_GLM_All <- glm(MessageLabel ~., data = Sparse_train_data, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# type = 'response' returns predicted spam probabilities, not log-odds.
GLMTest_All <- predict(SMS_GLM_All, Sparse_test_data, type = 'response')

# Confusion matrix at a 0.5 probability threshold:
# TRUE column = predicted Spam, FALSE column = predicted Legitimate.
GLM_Matrix_All <- table(Sparse_test_data$MessageLabel, GLMTest_All > 0.5)
GLM_Matrix_All
##             
##              FALSE TRUE
##   Legitimate  1415   32
##   Spam          38  186
summary(SMS_GLM_All)
## 
## Call:
## glm(formula = MessageLabel ~ ., family = "binomial", data = Sparse_train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1361  -0.0302   0.0000   0.0000   3.4952  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.106e+00  5.331e-01 -11.454  < 2e-16 ***
## got         -4.543e-01  1.471e+00  -0.309 0.757462    
## great       -1.311e-02  1.287e+00  -0.010 0.991875    
## wat         -1.720e+01  9.612e+03  -0.002 0.998572    
## world       -1.782e+01  1.545e+04  -0.001 0.999080    
## lar         -1.964e+01  1.569e+04  -0.001 0.999001    
## apply        4.504e+01  1.679e+04   0.003 0.997859    
## free         3.371e+00  8.598e-01   3.921 8.83e-05 ***
## may         -1.816e+01  1.750e+04  -0.001 0.999172    
## receive      8.233e-01  1.973e+00   0.417 0.676544    
## text         3.917e+00  9.687e-01   4.044 5.26e-05 ***
## txt          5.045e+01  3.976e+03   0.013 0.989878    
## win          5.004e+00  1.777e+00   2.816 0.004869 ** 
## already      1.828e+00  1.475e+00   1.239 0.215359    
## dun         -1.394e+01  1.197e+04  -0.001 0.999071    
## early       -2.213e+01  1.838e+04  -0.001 0.999039    
## say         -1.835e+01  9.569e+03  -0.002 0.998470    
## around       4.860e+00  1.575e+00   3.085 0.002035 ** 
## think       -2.431e+01  7.463e+03  -0.003 0.997401    
## back         1.234e-02  1.293e+00   0.010 0.992383    
## fun          2.441e+00  1.680e+00   1.453 0.146228    
## hey         -3.432e+00  2.552e+00  -1.345 0.178667    
## like         7.103e-01  1.197e+00   0.594 0.552774    
## now          2.226e+00  6.327e-01   3.519 0.000434 ***
## send         3.579e+00  9.774e-01   3.662 0.000250 ***
## still       -4.671e+00  2.937e+00  -1.590 0.111761    
## word         1.948e-01  1.844e+00   0.106 0.915887    
## xxx          6.951e+00  2.159e+00   3.219 0.001285 ** 
## even        -1.021e+00  3.217e+00  -0.317 0.750957    
## speak       -6.358e+01  8.812e+03  -0.007 0.994243    
## friends     -6.491e+00  5.818e+00  -1.116 0.264588    
## per          7.656e+00  1.934e+00   3.958 7.57e-05 ***
## call         2.175e+00  5.507e-01   3.950 7.80e-05 ***
## claim        9.191e+01  8.925e+03   0.010 0.991784    
## code         2.199e+01  2.041e+04   0.001 0.999140    
## customer     2.073e+00  1.557e+00   1.331 0.183079    
## network      7.530e+00  4.224e+00   1.783 0.074636 .  
## prize        2.936e+01  7.418e+03   0.004 0.996842    
## selected    -1.025e+00  1.121e+01  -0.091 0.927149    
## camera      -1.642e+00  3.924e+01  -0.042 0.966616    
## latest       4.061e+00  3.894e+01   0.104 0.916953    
## mobile       3.784e+00  8.384e-01   4.513 6.39e-06 ***
## enough      -2.068e+01  1.892e+04  -0.001 0.999128    
## gonna       -6.736e+01  1.259e+04  -0.005 0.995731    
## home        -8.667e+00  5.050e+00  -1.716 0.086128 .  
## soon        -1.654e+01  7.496e+03  -0.002 0.998239    
## stuff        4.467e-01  1.692e+00   0.264 0.791728    
## talk         2.794e+00  1.380e+00   2.025 0.042828 *  
## today       -1.372e+00  1.750e+00  -0.784 0.433190    
## tonight     -1.198e-01  5.571e+00  -0.022 0.982837    
## want         1.082e+00  6.602e-01   1.639 0.101227    
## cash         2.583e+00  1.393e+00   1.854 0.063698 .  
## cost         7.456e+00  2.334e+00   3.195 0.001397 ** 
## days         2.581e+00  1.928e+00   1.339 0.180587    
## reply        5.211e+00  1.291e+00   4.036 5.43e-05 ***
## pobox        9.561e+01  1.373e+04   0.007 0.994442    
## urgent       2.470e+00  2.252e+00   1.097 0.272729    
## week        -1.639e+00  1.561e+00  -1.050 0.293718    
## won          3.166e+01  1.079e+04   0.003 0.997659    
## help         6.225e+00  1.155e+00   5.390 7.05e-08 ***
## right       -7.274e+00  9.509e+00  -0.765 0.444302    
## take         1.377e-01  1.819e+00   0.076 0.939652    
## thank       -3.519e+00  2.593e+01  -0.136 0.892070    
## will         9.335e-01  7.987e-01   1.169 0.242519    
## wont        -2.368e+01  1.478e+04  -0.002 0.998721    
## message      1.240e+00  1.088e+00   1.140 0.254277    
## next.        3.664e+00  1.358e+00   2.697 0.006994 ** 
## use          2.786e+00  1.365e+00   2.041 0.041285 *  
## watching    -1.749e+01  1.819e+04  -0.001 0.999233    
## make         7.918e-02  4.635e+00   0.017 0.986370    
## name         6.671e-01  4.062e+00   0.164 0.869560    
## remember    -1.192e+02  1.925e+04  -0.006 0.995060    
## yes          3.682e-01  1.945e+00   0.189 0.849855    
## feel        -1.855e+01  1.011e+04  -0.002 0.998536    
## fine        -1.783e+01  1.756e+04  -0.001 0.999190    
## way         -2.596e+01  7.944e+03  -0.003 0.997392    
## dont         3.431e-01  1.122e+00   0.306 0.759766    
## miss         1.146e+00  1.893e+00   0.606 0.544765    
## going       -2.716e+01  6.505e+03  -0.004 0.996668    
## try         -4.359e+00  7.030e+00  -0.620 0.535254    
## first       -9.562e-01  2.110e+00  -0.453 0.650353    
## finish      -7.645e+01  1.662e+04  -0.005 0.996329    
## lor         -2.246e+01  1.260e+04  -0.002 0.998577    
## lunch       -1.715e+01  1.465e+04  -0.001 0.999066    
## can         -1.098e+00  8.960e-01  -1.226 0.220222    
## meet        -7.232e-01  3.236e+00  -0.223 0.823159    
## eat         -1.672e+01  1.478e+04  -0.001 0.999097    
## getting      3.478e+00  1.422e+00   2.446 0.014439 *  
## just        -1.406e+00  1.087e+00  -1.293 0.195952    
## lol         -2.015e+01  1.301e+04  -0.002 0.998765    
## really      -2.059e+01  9.936e+03  -0.002 0.998347    
## always      -3.268e+01  1.099e+04  -0.003 0.997628    
## bus         -1.864e+01  1.346e+04  -0.001 0.998895    
## dinner      -1.990e+01  1.602e+04  -0.001 0.999009    
## left         2.642e+00  2.013e+00   1.312 0.189432    
## love        -6.550e-01  2.277e+00  -0.288 0.773560    
## amp         -1.560e+01  8.706e+03  -0.002 0.998570    
## car         -1.660e+01  1.177e+04  -0.001 0.998874    
## know        -3.808e+00  1.947e+00  -1.956 0.050452 .  
## let         -4.275e+01  2.160e+04  -0.002 0.998421    
## room        -1.679e+01  1.526e+04  -0.001 0.999122    
## work        -1.592e+01  2.604e+03  -0.006 0.995122    
## live         6.913e-01  8.378e+00   0.083 0.934235    
## sure        -1.964e+01  1.329e+04  -0.001 0.998821    
## wait        -1.918e+01  6.353e+03  -0.003 0.997591    
## yeah        -2.306e+01  1.486e+04  -0.002 0.998762    
## anything    -2.763e+01  1.310e+04  -0.002 0.998317    
## tell        -3.038e+00  1.751e+00  -1.735 0.082715 .  
## month       -6.109e+00  3.261e+00  -1.873 0.061037 .  
## please       1.915e+00  9.694e-01   1.976 0.048152 *  
## thanks       5.366e-01  1.383e+00   0.388 0.698084    
## look         2.559e+00  2.757e+00   0.928 0.353302    
## msg          3.949e+00  1.399e+00   2.822 0.004775 ** 
## yup         -2.376e+01  1.642e+04  -0.001 0.998846    
## done        -1.733e+01  1.616e+04  -0.001 0.999144    
## see         -1.300e-02  1.330e+00  -0.010 0.992202    
## hello        4.308e+00  1.410e+00   3.054 0.002255 ** 
## trying      -1.615e+01  4.955e+04   0.000 0.999740    
## pls         -2.257e+00  1.728e+00  -1.306 0.191561    
## weekend     -9.776e-01  1.902e+00  -0.514 0.607278    
## need        -3.990e+00  2.358e+00  -1.692 0.090658 .  
## sweet       -2.316e+01  1.437e+04  -0.002 0.998715    
## nokia       -2.602e+00  1.262e+01  -0.206 0.836656    
## sms          3.329e+00  1.396e+00   2.385 0.017099 *  
## tomorrow     1.804e+00  1.911e+00   0.944 0.345156    
## hope         8.973e-02  2.740e+00   0.033 0.973880    
## ltgt        -3.289e+01  3.928e+03  -0.008 0.993318    
## man         -2.038e+01  1.427e+04  -0.001 0.998860    
## well        -2.398e+01  9.446e+03  -0.003 0.997975    
## get          4.705e-01  7.914e-01   0.595 0.552165    
## ask         -1.774e+01  9.901e+03  -0.002 0.998571    
## bit         -7.084e+01  1.307e+04  -0.005 0.995674    
## maybe       -1.479e+01  1.810e+04  -0.001 0.999348    
## class       -3.378e+01  4.196e+03  -0.008 0.993577    
## time         1.085e+00  1.183e+00   0.917 0.359216    
## half        -5.133e+00  6.764e+01  -0.076 0.939507    
## morning     -2.837e+01  8.726e+03  -0.003 0.997406    
## place        3.968e-02  1.578e+00   0.025 0.979943    
## best         2.210e+00  2.133e+00   1.036 0.300118    
## give        -5.659e-01  1.715e+00  -0.330 0.741458    
## happy       -7.980e+00  1.165e+01  -0.685 0.493404    
## never        7.241e-01  1.479e+00   0.490 0.624333    
## sorry       -9.739e-01  1.298e+00  -0.750 0.453053    
## thought     -2.165e+01  1.807e+04  -0.001 0.999044    
## end          1.404e+00  5.677e+00   0.247 0.804705    
## new          3.937e+00  9.170e-01   4.293 1.76e-05 ***
## play         9.409e-01  2.450e+00   0.384 0.700914    
## find         6.067e+00  2.237e+00   2.712 0.006693 ** 
## special      4.168e+00  1.817e+00   2.294 0.021801 *  
## year        -1.987e+01  1.808e+04  -0.001 0.999123    
## later       -2.767e+01  1.369e+04  -0.002 0.998387    
## meeting     -3.650e+01  1.477e+04  -0.002 0.998028    
## pick        -2.914e-01  1.736e+00  -0.168 0.866696    
## good         4.275e-01  1.182e+00   0.362 0.717475    
## part         3.195e+00  1.906e+00   1.676 0.093655 .  
## come        -1.513e+00  1.522e+00  -0.995 0.319926    
## check        1.616e+00  1.443e+00   1.120 0.262676    
## nice        -1.842e+01  1.172e+04  -0.002 0.998746    
## said        -2.075e+01  7.676e+03  -0.003 0.997843    
## awarded      1.309e+01  1.026e+04   0.001 0.998982    
## day          2.168e+00  8.910e-01   2.433 0.014966 *  
## hear         2.582e+00  1.960e+00   1.317 0.187690    
## money        3.064e+00  1.786e+00   1.715 0.086347 .  
## babe         4.156e+00  1.203e+00   3.454 0.000552 ***
## something   -1.995e+01  1.174e+04  -0.002 0.998645    
## wanna       -4.300e+00  3.763e+00  -1.143 0.253161    
## waiting      1.914e+00  1.440e+00   1.329 0.183828    
## cool        -1.503e+00  1.804e+00  -0.833 0.404731    
## thats       -2.782e+01  1.400e+04  -0.002 0.998415    
## much        -5.297e+00  2.528e+00  -2.095 0.036158 *  
## job         -2.124e+01  1.146e+04  -0.002 0.998520    
## looking      4.744e+00  5.366e+00   0.884 0.376609    
## stop         5.784e+00  1.775e+00   3.259 0.001118 ** 
## one          9.137e-01  1.243e+00   0.735 0.462169    
## real        -1.356e+00  5.764e+00  -0.235 0.814043    
## bed         -2.168e+01  1.741e+04  -0.001 0.999006    
## another     -1.211e+01  4.878e+03  -0.002 0.998019    
## late         1.948e+00  1.325e+00   1.470 0.141491    
## night       -2.294e+00  2.022e+00  -1.135 0.256417    
## smile       -1.187e+01  9.307e+03  -0.001 0.998983    
## someone     -7.129e+00  3.003e+00  -2.374 0.017594 *  
## guaranteed   2.356e+01  1.124e+04   0.002 0.998328    
## service      4.559e+00  2.236e+00   2.039 0.041485 *  
## buy          2.343e+00  1.717e+00   1.365 0.172347    
## forgot      -1.438e+01  1.419e+04  -0.001 0.999192    
## nothing     -3.641e+00  1.577e+01  -0.231 0.817483    
## long        -1.820e+01  1.581e+04  -0.001 0.999082    
## yet          2.280e-01  2.416e+00   0.094 0.924816    
## guess        2.934e+00  1.958e+00   1.499 0.133985    
## dear         1.241e+00  1.081e+00   1.148 0.250810    
## life         2.572e-01  2.356e+00   0.109 0.913047    
## lot         -1.977e+01  1.654e+04  -0.001 0.999046    
## birthday    -1.494e+01  1.762e+04  -0.001 0.999323    
## aight       -1.836e+01  2.088e+04  -0.001 0.999298    
## better      -1.689e+01  1.123e+04  -0.002 0.998800    
## people       1.848e+00  2.901e+00   0.637 0.524044    
## cos         -1.808e+01  1.289e+04  -0.001 0.998881    
## things       1.622e-01  2.685e+00   0.060 0.951823    
## contact      4.770e+00  2.382e+00   2.002 0.045289 *  
## draw         5.639e+00  1.571e+00   3.589 0.000332 ***
##  [ reached getOption("max.print") -- omitted 91 rows ]
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3074.36  on 3900  degrees of freedom
## Residual deviance:  283.73  on 3610  degrees of freedom
## AIC: 865.73
## 
## Number of Fisher Scoring iterations: 23

Analysing the summary for the Logistic Regression train model, we can infer that: 1. The distribution of residuals is not very symmetrical; that is, the model also predicts points far away from the actual observed points. 2. The model reveals that ‘call’ is the most important term, as its value of error is the same as the value of error for the Intercept.

Accuracy for test data.

# Tabulate true labels against thresholded probabilities (kept for
# inspection). NOTE(review): 0.75 cut-off here vs. 0.5 in the confusion
# matrix above -- confirm which threshold is intended.
glm.accuracy.table.All <- as.data.frame(table(Sparse_test_data$MessageLabel, GLMTest_All > 0.75))
# Accuracy computed by direct comparison: TRUE corresponds to predicted
# Spam (the second factor level modelled by glm). This replaces the
# fragile Freq[1] + Freq[4] positional indexing.
print(paste("Accuracy of Logistic Regression is:",
            100 * round(mean((GLMTest_All > 0.75) == (Sparse_test_data$MessageLabel == "Spam")), 4),
            "%"))
## [1] "Accuracy of Logistic Regression is: 96.17 %"

ROCR Curve

library(ROCR)
# GLMTest_All already holds probabilities in [0, 1] (predict with
# type = 'response'), so the abs() wrapper in the original call was a
# no-op and has been removed.
Logistic_Regression_Prediction_All <- prediction(GLMTest_All, Sparse_test_data$MessageLabel)
Logistic_Regression_Performance_All <- performance(Logistic_Regression_Prediction_All, "tpr", "fpr")
# Colourised ROC curve: colour encodes the probability cut-off.
plot(Logistic_Regression_Performance_All, colorize = TRUE, text.adj = c(-0.2, 1.7))

The ROCR curve substantiates the high accuracy of the model as the closer the curve follows the left-hand border and then the top border of the ROC space, the more accurate the test.

Plot Confusion Matrix

# Confusion-matrix cells for the all-terms logistic regression; counts
# match GLM_Matrix_All above.
Reference_GLM_All <- factor(rep(c("Legitimate", "Spam"), each = 2))
Prediction_GLM_All <- factor(rep(c("Legitimate", "Spam"), times = 2))
Y_GLM_All <- c(1415, 32, 38, 186)
ConfusionMatrixPlot_GLM_All <- data.frame(Reference_GLM_All, Prediction_GLM_All, Y_GLM_All)

# Render the heat-map: fill colour and text label both show cell counts.
glm_all_cm_plot <- ggplot(ConfusionMatrixPlot_GLM_All,
                          aes(x = Reference_GLM_All, y = Prediction_GLM_All)) +
  geom_tile(aes(fill = Y_GLM_All), colour = "white") +
  geom_text(aes(label = sprintf("%1.0f", Y_GLM_All)), vjust = 1) +
  scale_fill_gradient(low = "yellow", high = "dark green") +
  theme_bw() +
  theme(legend.position = "none")
print(glm_all_cm_plot)

Naive Bayes Model

# Naive Bayes with Laplace smoothing (laplace = 1) over all sparse terms.
# NOTE(review): the accuracy reported below (~21%) is very poor --
# e1071::naiveBayes models numeric predictors as Gaussian, so the 0/1
# term-count columns are presumably better converted to factors first;
# verify the column types of Sparse_train_data.
SMS_NB_All = naiveBayes(MessageLabel ~. , data = Sparse_train_data, laplace = 1)
SMS_NBTest_All = predict(SMS_NB_All, Sparse_test_data) 


library(gmodels)
# Cross-tabulate predictions vs. actual labels; chi-square contributions
# and table-total proportions are suppressed to keep the output compact.
CT <- CrossTable(SMS_NBTest_All, Sparse_test_data$MessageLabel, 
           prop.chisq = FALSE, 
           prop.t = FALSE, 
           dnn = c("Predicted", "Actual")) #Name of column
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1671 
## 
##  
##              | Actual 
##    Predicted | Legitimate |       Spam |  Row Total | 
## -------------|------------|------------|------------|
##   Legitimate |        127 |          2 |        129 | 
##              |      0.984 |      0.016 |      0.077 | 
##              |      0.088 |      0.009 |            | 
## -------------|------------|------------|------------|
##         Spam |       1320 |        222 |       1542 | 
##              |      0.856 |      0.144 |      0.923 | 
##              |      0.912 |      0.991 |            | 
## -------------|------------|------------|------------|
## Column Total |       1447 |        224 |       1671 | 
##              |      0.866 |      0.134 |            | 
## -------------|------------|------------|------------|
## 
## 

This classifier has produced a model with: 1. Precision for predicting Legitimate messages is 0.98, while for predicting spam messages it is 0.144. 2. Recall for predicting Legitimate messages is 0.08, while for predicting spam messages it is 0.99. Therefore, we can deduce that the probability of predicting a Legitimate message as Spam is quite high (0.86) as compared to the probability of a spam message being predicted as a legitimate message (0.02).

Accuracy for test data.

# Tabulate predictions against true labels (kept for inspection).
nb.accuracy.table.all <- as.data.frame(table(Sparse_test_data$MessageLabel, SMS_NBTest_All))
# Accuracy by direct vector comparison, replacing the fragile
# Freq[1] + Freq[4] positional indexing (which breaks if factor levels
# change order).
print(paste("Accuracy for NB is:",
             100 * round(mean(SMS_NBTest_All == Sparse_test_data$MessageLabel), 4),
             "%"))
## [1] "Accuracy for NB is: 20.89 %"

Plot Confusion Matrix

# Confusion-matrix cells for Naive Bayes; counts match the CrossTable
# output above (127 / 1320 / 2 / 222).
Reference_NB_All <- factor(rep(c("Legitimate", "Spam"), each = 2))
Prediction_NB_All <- factor(rep(c("Legitimate", "Spam"), times = 2))
Y_NB_All <- c(127, 1320, 2, 222)
ConfusionMatrixPlot_NB_All <- data.frame(Reference_NB_All, Prediction_NB_All, Y_NB_All)

# Heat-map confusion matrix: darker tiles carry larger counts.
nb_cm_plot <- ggplot(ConfusionMatrixPlot_NB_All,
                     aes(x = Reference_NB_All, y = Prediction_NB_All)) +
  geom_tile(aes(fill = Y_NB_All), colour = "white") +
  geom_text(aes(label = sprintf("%1.0f", Y_NB_All)), vjust = 1) +
  scale_fill_gradient(low = "yellow", high = "dark green") +
  theme_bw() +
  theme(legend.position = "none")
print(nb_cm_plot)

Setting 1: Building models based on Manually Engineered Features of Spam SMS.

Decision Tree Model

# Build a recursive partitioning decision tree on the six manually
# engineered spam-indicator tokens, as a classification tree.

SMS_Rpart <- rpart(formula = MessageLabel ~ free + winner + congratulation + adult + attention + ringtone, data = train_data, method = "class")

# Draw the fitted tree; extra = 4 shows per-node class probabilities
# (see rpart.plot docs for type/extra codes).
rpart.plot(SMS_Rpart, type = 4, fallen.leaves = FALSE, extra = 4)

This tree reveals that out of all these tokens, the most important token is ‘ringtone’ and the least important ones are ‘congratulation’ and ‘adult’.

summary(SMS_Rpart)
## Call:
## rpart(formula = MessageLabel ~ free + winner + congratulation + 
##     adult + attention + ringtone, data = train_data, method = "class")
##   n= 3901 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.28871893      0 1.0000000 1.0000000 0.04069031
## 2 0.08221797      1 0.7112811 0.7112811 0.03507580
## 3 0.06883365      3 0.5468451 0.6424474 0.03350501
## 4 0.01000000      4 0.4780115 0.4780115 0.02924733
## 
## Variable importance
##       ringtone           free         winner      attention congratulation 
##             68             13             11              6              1 
## 
## Node number 1: 3901 observations,    complexity param=0.2887189
##   predicted class=Legitimate  expected loss=0.1340682  P(node) =1
##     class counts:  3378   523
##    probabilities: 0.866 0.134 
##   left son=2 (3204 obs) right son=3 (697 obs)
##   Primary splits:
##       ringtone       splits as  LR, improve=381.73920, (0 missing)
##       winner         splits as  LR, improve=152.27790, (0 missing)
##       free           splits as  LR, improve=135.13340, (0 missing)
##       attention      splits as  LR, improve=102.71100, (0 missing)
##       congratulation splits as  LR, improve= 14.21593, (0 missing)
##   Surrogate splits:
##       free           splits as  LR, agree=0.843, adj=0.123, (0 split)
##       winner         splits as  LR, agree=0.830, adj=0.049, (0 split)
##       congratulation splits as  LR, agree=0.823, adj=0.010, (0 split)
## 
## Node number 2: 3204 observations
##   predicted class=Legitimate  expected loss=0.03089888  P(node) =0.8213279
##     class counts:  3105    99
##    probabilities: 0.969 0.031 
## 
## Node number 3: 697 observations,    complexity param=0.08221797
##   predicted class=Spam        expected loss=0.3916786  P(node) =0.1786721
##     class counts:   273   424
##    probabilities: 0.392 0.608 
##   left son=6 (532 obs) right son=7 (165 obs)
##   Primary splits:
##       winner         splits as  LR, improve=43.9829100, (0 missing)
##       attention      splits as  LR, improve=37.9821300, (0 missing)
##       free           splits as  LR, improve=25.1773400, (0 missing)
##       congratulation splits as  LR, improve= 4.3835890, (0 missing)
##       adult          splits as  LR, improve= 0.8117561, (0 missing)
##   Surrogate splits:
##       congratulation splits as  LR, agree=0.766, adj=0.012, (0 split)
## 
## Node number 6: 532 observations,    complexity param=0.08221797
##   predicted class=Spam        expected loss=0.4906015  P(node) =0.1363753
##     class counts:   261   271
##    probabilities: 0.491 0.509 
##   left son=12 (352 obs) right son=13 (180 obs)
##   Primary splits:
##       attention splits as  LR, improve=36.011700, (0 missing)
##       free      splits as  LR, improve=35.200000, (0 missing)
##       adult     splits as  LR, improve= 1.243285, (0 missing)
##   Surrogate splits:
##       congratulation splits as  LR, agree=0.665, adj=0.011, (0 split)
## 
## Node number 7: 165 observations
##   predicted class=Spam        expected loss=0.07272727  P(node) =0.04229685
##     class counts:    12   153
##    probabilities: 0.073 0.927 
## 
## Node number 12: 352 observations,    complexity param=0.06883365
##   predicted class=Legitimate  expected loss=0.3778409  P(node) =0.09023327
##     class counts:   219   133
##    probabilities: 0.622 0.378 
##   left son=24 (294 obs) right son=25 (58 obs)
##   Primary splits:
##       free  splits as  LR, improve=25.979660, (0 missing)
##       adult splits as  LR, improve= 2.669721, (0 missing)
## 
## Node number 13: 180 observations
##   predicted class=Spam        expected loss=0.2333333  P(node) =0.04614201
##     class counts:    42   138
##    probabilities: 0.233 0.767 
## 
## Node number 24: 294 observations
##   predicted class=Legitimate  expected loss=0.292517  P(node) =0.07536529
##     class counts:   208    86
##    probabilities: 0.707 0.293 
## 
## Node number 25: 58 observations
##   predicted class=Spam        expected loss=0.1896552  P(node) =0.01486798
##     class counts:    11    47
##    probabilities: 0.190 0.810

Random Forest Classifier

Apply Random Forest to substantiate analysis of Decision Tree by plotting the importance of each token.

# Convert the label and the six engineered token columns to factors so
# randomForest treats this as classification; keep the raw message text
# as character. One lapply replaces eight near-identical %<>% lines.
factor_cols <- c("MessageLabel", "free", "winner", "congratulation",
                 "adult", "attention", "ringtone")
train_data[factor_cols] <- lapply(train_data[factor_cols], as.factor)
train_data$Message <- as.character(train_data$Message)

# Apply the formula for Random Forest Algorithm
SMS_RF <- MessageLabel ~ free + winner + congratulation + adult + attention + ringtone
# NOTE(review): no set.seed() precedes this call, so the forest and the
# importance scores below are not reproducible run-to-run.
RFSpam_Tree <- randomForest(SMS_RF, data = train_data, ntree = 25, proximity = TRUE)

# Plot the Variable Importance Plot.
ImportancePlot <- varImpPlot(RFSpam_Tree, main = "Importance of each Token")

This plot also expresses that the most important token amongst all is ‘ringtone’, and the least important are ‘adult’ and ‘congratulation’.

# Importance of each token in tabular form (mean decrease in Gini).
importance(RFSpam_Tree)
##                MeanDecreaseGini
## free                  65.709579
## winner                74.077206
## congratulation         2.292506
## adult                  2.781112
## attention             50.865231
## ringtone             280.094146

Test the above Random Forest Model on test data and check the accuracy, precision, recall and F1.

# Mirror the training-data conversions so the test columns have the same
# types the model was trained on.
test_factor_cols <- c("MessageLabel", "free", "winner", "congratulation",
                      "adult", "attention", "ringtone")
test_data[test_factor_cols] <- lapply(test_data[test_factor_cols], as.factor)
test_data$Message <- as.character(test_data$Message)

# Predict once and reuse the result; the original called predict() a
# second time inside confusionMatrix(), doing the same work twice.
RFTest <- predict(RFSpam_Tree, newdata = test_data)

# Confusion Matrix
RF_Matrix <- confusionMatrix(RFTest, test_data$MessageLabel)
RF_Matrix
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Legitimate Spam
##   Legitimate       1419   84
##   Spam               28  140
##                                           
##                Accuracy : 0.933           
##                  95% CI : (0.9199, 0.9445)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6772          
##  Mcnemar's Test P-Value : 2.025e-07       
##                                           
##             Sensitivity : 0.9806          
##             Specificity : 0.6250          
##          Pos Pred Value : 0.9441          
##          Neg Pred Value : 0.8333          
##              Prevalence : 0.8659          
##          Detection Rate : 0.8492          
##    Detection Prevalence : 0.8995          
##       Balanced Accuracy : 0.8028          
##                                           
##        'Positive' Class : Legitimate      
## 
# CrossTable: row/column/total proportions for the Random Forest
# predictions against the true labels.
CrossTable(RFTest, test_data$MessageLabel, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1671 
## 
##  
##              | test_data$MessageLabel 
##       RFTest | Legitimate |       Spam |  Row Total | 
## -------------|------------|------------|------------|
##   Legitimate |       1419 |         84 |       1503 | 
##              |      0.944 |      0.056 |      0.899 | 
##              |      0.981 |      0.375 |            | 
##              |      0.849 |      0.050 |            | 
## -------------|------------|------------|------------|
##         Spam |         28 |        140 |        168 | 
##              |      0.167 |      0.833 |      0.101 | 
##              |      0.019 |      0.625 |            | 
##              |      0.017 |      0.084 |            | 
## -------------|------------|------------|------------|
## Column Total |       1447 |        224 |       1671 | 
##              |      0.866 |      0.134 |            | 
## -------------|------------|------------|------------|
## 
## 

This classifier has produced a model with: 1. Precision for predicting Legitimate messages is 0.94, while for predicting spam messages it is 0.83. 2. Recall for predicting Legitimate messages is 0.98, while for predicting spam messages it is 0.61. Therefore, we can deduce that the probability of predicting a Legitimate message as Spam is moderately high (0.17) as compared to the probability of a spam message being predicted as a legitimate message (0.06).

Accuracy for test data.

# Percentage of test messages whose Random Forest prediction matches the
# true label (mean of the match indicator is sum(matches)/n).
TestPredictability <- mean(RFTest == test_data$MessageLabel) * 100

message("Accuracy for Test Data is:")
## Accuracy for Test Data is:
print(TestPredictability)
## [1] 93.29743

Plot Confusion Matrix

# Cell counts taken from the Random Forest confusion matrix above:
# reference Legitimate -> 1419 predicted Legitimate, 28 predicted Spam;
# reference Spam -> 84 predicted Legitimate, 140 predicted Spam.
# BUGFIX: the original hard-coded 87 and 137, which do not match the
# confusionMatrix/CrossTable output (84 and 140).
Reference_RF <- factor(c("Legitimate", "Legitimate", "Spam", "Spam"))
Prediction_RF <- factor(c("Legitimate", "Spam","Legitimate","Spam"))
Y <- c(1419, 28, 84, 140)
ConfusionMatrixPlot_RF <- data.frame(Reference_RF, Prediction_RF, Y)

# Heat-map confusion matrix: tile fill and label both show the count.
ggplot(data =  ConfusionMatrixPlot_RF, mapping = aes(x = Reference_RF, y = Prediction_RF)) +
     geom_tile(aes(fill = Y), colour = "white") +
     geom_text(aes(label = sprintf("%1.0f", Y)), vjust = 1) +
     scale_fill_gradient(low = "yellow", high = "dark green") +
     theme_bw() + theme(legend.position = "none")

Support Vector Machine

# Linear-kernel SVM on the six engineered token features.
# NOTE(review): gamma is not used by kernel = "linear" in e1071::svm
# (it applies to radial/polynomial/sigmoid kernels); kept for parity
# with the original call but it has no effect -- confirm intent.
SMS_SVM <- svm(MessageLabel ~ free + winner + congratulation + adult + attention + ringtone, data = train_data, kernel = "linear", cost = 0.1, gamma = 0.1)
SVMTest <- predict(SMS_SVM, test_data)

# Confusion Matrix -- reuse SVMTest instead of re-running predict() as
# the original did.
SVM_Matrix <- confusionMatrix(SVMTest, test_data$MessageLabel)
SVM_Matrix
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Legitimate Spam
##   Legitimate       1413   79
##   Spam               34  145
##                                           
##                Accuracy : 0.9324          
##                  95% CI : (0.9193, 0.9439)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6817          
##  Mcnemar's Test P-Value : 3.486e-05       
##                                           
##             Sensitivity : 0.9765          
##             Specificity : 0.6473          
##          Pos Pred Value : 0.9471          
##          Neg Pred Value : 0.8101          
##              Prevalence : 0.8659          
##          Detection Rate : 0.8456          
##    Detection Prevalence : 0.8929          
##       Balanced Accuracy : 0.8119          
##                                           
##        'Positive' Class : Legitimate      
## 
# CrossTable: row/column/total proportions for the SVM predictions
# against the true labels.
CrossTable(SVMTest, test_data$MessageLabel, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1671 
## 
##  
##              | test_data$MessageLabel 
##      SVMTest | Legitimate |       Spam |  Row Total | 
## -------------|------------|------------|------------|
##   Legitimate |       1413 |         79 |       1492 | 
##              |      0.947 |      0.053 |      0.893 | 
##              |      0.977 |      0.353 |            | 
##              |      0.846 |      0.047 |            | 
## -------------|------------|------------|------------|
##         Spam |         34 |        145 |        179 | 
##              |      0.190 |      0.810 |      0.107 | 
##              |      0.023 |      0.647 |            | 
##              |      0.020 |      0.087 |            | 
## -------------|------------|------------|------------|
## Column Total |       1447 |        224 |       1671 | 
##              |      0.866 |      0.134 |            | 
## -------------|------------|------------|------------|
## 
## 

This classifier has produced a model with: 1. Precision for predicting Legitimate messages is 0.95, while for predicting spam messages it is 0.8. 2. Recall for predicting Legitimate messages is 0.98, while for predicting spam messages it is 0.65. Therefore, we can deduce that the probability of predicting a Legitimate message as Spam is moderately high (0.19) as compared to the probability of a spam message being predicted as a legitimate message (0.05).

Accuracy for test data.

# Tabulate true labels against predictions (kept for inspection).
svm.accuracy.table <- as.data.frame(table(test_data$MessageLabel, SVMTest))
# Accuracy by direct comparison, replacing the fragile positional
# Freq[1] + Freq[4] indexing that breaks if factor levels reorder.
print(paste("Accuracy for SVM is:",
            100 * round(mean(SVMTest == test_data$MessageLabel), 4),
            "%"))
## [1] "Accuracy for SVM is: 93.24 %"

Plot confusion matrix.

# Confusion-matrix cells for the engineered-feature SVM; counts match
# SVM_Matrix above (1413 / 34 / 79 / 145).
Reference_SVM <- factor(rep(c("Legitimate", "Spam"), each = 2))
Prediction_SVM <- factor(rep(c("Legitimate", "Spam"), times = 2))
Y_SVM <- c(1413, 34, 79, 145)
ConfusionMatrixPlot_SVM <- data.frame(Reference_SVM, Prediction_SVM, Y_SVM)

# Heat-map confusion matrix: fill colour and label both show counts.
svm_cm_plot <- ggplot(ConfusionMatrixPlot_SVM,
                      aes(x = Reference_SVM, y = Prediction_SVM)) +
  geom_tile(aes(fill = Y_SVM), colour = "white") +
  geom_text(aes(label = sprintf("%1.0f", Y_SVM)), vjust = 1) +
  scale_fill_gradient(low = "yellow", high = "dark green") +
  theme_bw() +
  theme(legend.position = "none")
print(svm_cm_plot)

Logistic regression

# Logistic regression on the six engineered token features only.
SMS_GLM <- glm(MessageLabel ~ free + winner + congratulation + adult + attention + ringtone, data = train_data, family = "binomial")
# Predicted spam probabilities; type = 'response' yields values in [0, 1].
GLMTest <- predict(SMS_GLM, test_data, type = 'response')

# Confusion Matrix at a 0.5 cut-off; TRUE appears to correspond to a
# Spam prediction (second factor level -- confirm level order).
# NOTE(review): the accuracy block further down uses a 0.75 cut-off.
GLM_Matrix <- table(test_data$MessageLabel, GLMTest > 0.5)
GLM_Matrix
##             
##              FALSE TRUE
##   Legitimate  1415   32
##   Spam          82  142
summary(SMS_GLM)
## 
## Call:
## glm(formula = MessageLabel ~ free + winner + congratulation + 
##     adult + attention + ringtone, family = "binomial", data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2869  -0.1661  -0.1661  -0.1661   2.9294  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -4.2768     0.1354 -31.583  < 2e-16 ***
## freey             2.2517     0.2502   9.001  < 2e-16 ***
## winnery           2.5418     0.1995  12.739  < 2e-16 ***
## congratulationy   2.2650     0.9018   2.512  0.01202 *  
## adulty            1.1240     0.3447   3.261  0.00111 ** 
## attentiony        1.4571     0.1540   9.461  < 2e-16 ***
## ringtoney         3.4235     0.1476  23.188  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3074.4  on 3900  degrees of freedom
## Residual deviance: 1402.8  on 3894  degrees of freedom
## AIC: 1416.8
## 
## Number of Fisher Scoring iterations: 6

Analysing the summary for the Logistic Regression train model, we can infer that: 1. The distribution of residuals is symmetrical; that is, the model can accurately predict points that are close to the actual observed points. 2. The model reveals that ‘congratulation’ and ‘adult’ are the least important terms, as their standard errors are far greater than the standard error of the Intercept.

Accuracy for test data.

# Logistic-regression accuracy at a 0.75 spam-probability cutoff.
# NOTE(review): the confusion matrix above used a 0.5 cutoff — confirm the
# stricter 0.75 threshold here is intentional.
glm.accuracy.table <- as.data.frame(table(test_data$MessageLabel, GLMTest > 0.75))
glm_correct <- glm.accuracy.table$Freq[1] + glm.accuracy.table$Freq[4]
print(paste("Accuracy of Logistic Regression is:",
            100 * round(glm_correct / nrow(test_data), 4),
            "%"))
## [1] "Accuracy of Logistic Regression is: 92.94 %"

ROCR Curve

# ROC curve (true-positive rate vs false-positive rate) for the
# logistic-regression scores.
library(ROCR)
# predict(type = 'response') on a binomial GLM already yields probabilities
# in [0, 1], so the scores are passed straight through — the original wrapped
# them in a redundant abs().
Logistic_Regression_Prediction <- prediction(GLMTest, test_data$MessageLabel)
Logistic_Regression_Performance <- performance(Logistic_Regression_Prediction, "tpr", "fpr")
# Colour the curve by cutoff value so thresholds can be read off the plot.
plot(Logistic_Regression_Performance, colorize = TRUE, text.adj = c(-0.2, 1.7))

The ROCR curve substantiates the high accuracy of the model as the closer the curve follows the left-hand border and then the top border of the ROC space, the more accurate the test.

Plot Confusion Matrix

# Confusion-matrix heat map for the logistic-regression classifier, using the
# cell counts from the 0.5-cutoff table above.
lvls_glm <- c("Legitimate", "Spam")
Reference_GLM <- factor(rep(lvls_glm, each = 2))
Prediction_GLM <- factor(rep(lvls_glm, times = 2))
Y_GLM <- c(1415, 32, 82, 142)
ConfusionMatrixPlot_GLM <- data.frame(Reference_GLM, Prediction_GLM, Y_GLM)

# Tiles shaded yellow (low) to dark green (high), labelled with raw counts;
# the fill legend duplicates the labels and is suppressed.
ggplot(ConfusionMatrixPlot_GLM, aes(x = Reference_GLM, y = Prediction_GLM)) +
  geom_tile(aes(fill = Y_GLM), colour = "white") +
  geom_text(aes(label = sprintf("%1.0f", Y_GLM)), vjust = 1) +
  scale_fill_gradient(low = "yellow", high = "dark green") +
  theme_bw() +
  theme(legend.position = "none")

Naive Bayes Classifier

# Keep only terms appearing in 5 or more SMS messages, then rebuild the train
# and test document-term matrices restricted to that shared vocabulary.
Frequent_Terms <- findFreqTerms(TDM_train_data, 5)
TDM_train_data_New <- DocumentTermMatrix(Corpus_train_data, list(dictionary = Frequent_Terms))
TDM_test_data_New <- DocumentTermMatrix(Corpus_test_data, list(dictionary = Frequent_Terms))
# Convert term counts to a two-level presence factor: "Yes" when the term
# occurs at least once in the message, otherwise "No".
Convert_Numerics_To_Factors <- function(num) {
  factor(as.integer(num > 0), levels = c(0, 1), labels = c("No", "Yes"))
}

# Binarize the reduced train/test term matrices to "No"/"Yes" factors
# column by column.
TDM_train_data_New <- apply(TDM_train_data_New, MARGIN = 2, Convert_Numerics_To_Factors)
TDM_test_data_New <- apply(TDM_test_data_New, MARGIN = 2, Convert_Numerics_To_Factors)

# Fit Naive Bayes on the same six indicator terms used by the SVM and GLM
# models; laplace = 1 smooths away zero term/class probabilities.
SMS_NB <- naiveBayes(MessageLabel ~ free + winner + congratulation + adult + attention + ringtone, data = train_data, laplace = 1)
# BUG FIX: the model is trained on train_data's indicator columns, so it must
# be scored on test_data, which has the same columns and factor levels.  The
# original predicted on the binarized TDM, whose "No"/"Yes" coding need not
# match the training factors — a train/test representation mismatch that
# yields degenerate predictions (almost everything classed Legitimate).
SMS_NBTest <- predict(SMS_NB, test_data)


library(gmodels)
# Cross-tabulate Naive Bayes predictions against the true labels, reporting
# N plus row/column/table proportions; chi-square contributions are
# suppressed since only the confusion counts are of interest here.
CT <- CrossTable(SMS_NBTest, test_data$MessageLabel, 
           prop.chisq = FALSE, 
           dnn = c("Predicted", "Actual")) #Name of column
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1671 
## 
##  
##              | Actual 
##    Predicted | Legitimate |       Spam |  Row Total | 
## -------------|------------|------------|------------|
##   Legitimate |       1447 |        221 |       1668 | 
##              |      0.868 |      0.132 |      0.998 | 
##              |      1.000 |      0.987 |            | 
##              |      0.866 |      0.132 |            | 
## -------------|------------|------------|------------|
##         Spam |          0 |          3 |          3 | 
##              |      0.000 |      1.000 |      0.002 | 
##              |      0.000 |      0.013 |            | 
##              |      0.000 |      0.002 |            | 
## -------------|------------|------------|------------|
## Column Total |       1447 |        224 |       1671 | 
##              |      0.866 |      0.134 |            | 
## -------------|------------|------------|------------|
## 
## 

This classifier has produced a model with: 1. Precision for predicting Legitimate messages is 0.87, while for predicting spam messages it is 1.00. 2. Recall for predicting Legitimate messages is 1.00, while for predicting spam messages it is only 0.013. Therefore, we can deduce that the probability of predicting a Legitimate message as Spam is zero (0.00), while the probability that a message predicted as legitimate is actually spam is 0.13.

# Overall Naive Bayes accuracy: the diagonal of the actual-vs-predicted
# table over the number of test messages.
nb.accuracy.table <- as.data.frame(table(test_data$MessageLabel, SMS_NBTest))
nb_correct <- nb.accuracy.table$Freq[1] + nb.accuracy.table$Freq[4]
print(paste("Accuracy for NB is:",
             100 * round(nb_correct / nrow(test_data), 4),
             "%"))
## [1] "Accuracy for NB is: 86.77 %"

Plot Confusion Matrix

# Confusion-matrix heat map for the Naive Bayes classifier, with cell counts
# taken from the CrossTable above.
lvls_nb <- c("Legitimate", "Spam")
Reference_NB_All <- factor(rep(lvls_nb, each = 2))
Prediction_NB_All <- factor(rep(lvls_nb, times = 2))
Y_NB_All <- c(1447, 0, 221, 3)
ConfusionMatrixPlot_NB_All <- data.frame(Reference_NB_All, Prediction_NB_All, Y_NB_All)

# Tiles shaded yellow (low) to dark green (high) and labelled with counts;
# the redundant fill legend is hidden.
ggplot(ConfusionMatrixPlot_NB_All, aes(x = Reference_NB_All, y = Prediction_NB_All)) +
  geom_tile(aes(fill = Y_NB_All), colour = "white") +
  geom_text(aes(label = sprintf("%1.0f", Y_NB_All)), vjust = 1) +
  scale_fill_gradient(low = "yellow", high = "dark green") +
  theme_bw() +
  theme(legend.position = "none")